HW 02

Author

Weston Scott

1 - A new day, a new plot, a new geom

edibnb <- dsbox::edibnb
glimpse(edibnb)
Rows: 13,245
Columns: 10
$ id                   <dbl> 15420, 24288, 38628, 44552, 47616, 48645, 51505, …
$ price                <dbl> 80, 115, 46, 32, 100, 71, 175, 150, 139, 190, 85,…
$ neighbourhood        <chr> "New Town", "Southside", NA, "Leith", "Southside"…
$ accommodates         <dbl> 2, 4, 2, 2, 2, 3, 5, 5, 6, 10, 2, 4, 3, 2, 2, 4, …
$ bathrooms            <dbl> 1.0, 1.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0,…
$ bedrooms             <dbl> 1, 2, 0, 1, 1, 1, 2, 3, 4, 4, 1, 1, 1, 1, 1, 2, 1…
$ beds                 <dbl> 1, 2, 2, 1, 1, 2, 3, 4, 5, 7, 1, 1, 1, 1, 1, 2, 1…
$ review_scores_rating <dbl> 99, 92, 94, 93, 98, 97, 100, 92, 96, 99, 77, 98, …
$ number_of_reviews    <dbl> 283, 199, 52, 184, 32, 762, 7, 28, 222, 142, 14, …
$ listing_url          <chr> "https://www.airbnb.com/rooms/15420", "https://ww…
summary(edibnb)
       id               price        neighbourhood       accommodates   
 Min.   :   15420   Min.   :  0.00   Length:13245       Min.   : 1.000  
 1st Qu.:13279107   1st Qu.: 49.00   Class :character   1st Qu.: 2.000  
 Median :20171841   Median : 75.00   Mode  :character   Median : 3.000  
 Mean   :20077242   Mean   : 97.21                      Mean   : 3.541  
 3rd Qu.:27397925   3rd Qu.:110.00                      3rd Qu.: 4.000  
 Max.   :36066014   Max.   :999.00                      Max.   :19.000  
                    NA's   :199                                         
   bathrooms        bedrooms           beds        review_scores_rating
 Min.   :0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 20.00      
 1st Qu.:1.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 93.00      
 Median :1.000   Median : 1.000   Median : 2.000   Median : 97.00      
 Mean   :1.226   Mean   : 1.583   Mean   : 2.032   Mean   : 95.02      
 3rd Qu.:1.000   3rd Qu.: 2.000   3rd Qu.: 3.000   3rd Qu.: 99.00      
 Max.   :9.000   Max.   :13.000   Max.   :30.000   Max.   :100.00      
 NA's   :12      NA's   :4        NA's   :15       NA's   :2177        
 number_of_reviews listing_url       
 Min.   :  0.00    Length:13245      
 1st Qu.:  2.00    Class :character  
 Median : 12.00    Mode  :character  
 Mean   : 37.73                      
 3rd Qu.: 45.00                      
 Max.   :773.00                      
                                     
edibnb <- edibnb |>
    mutate(
        neighbourhood = fct_reorder(
            neighbourhood, 
            review_scores_rating, 
            .fun = median)
    ) |>
    filter(!is.na(neighbourhood))
ggplot(
    data = edibnb, 
    aes(
        x = review_scores_rating, 
        y = neighbourhood, 
        fill = neighbourhood
    )
) +
    
geom_density_ridges(
    scale = 2,
    rel_min_height = 0.01,
    legend.show = FALSE,
    alpha = 0.8
) +

scale_fill_viridis_d(
    option = "C", 
    begin = 0.1, 
    end = 0.9
) +

scale_y_discrete(expand = c(0, 0)) + 
scale_x_continuous(
    expand = c(0, 0.25),
    limits = c(90, 100)
) + 

coord_cartesian(clip = "off") +
labs(
    title = "Problem 1 - Ridgeline plot", 
    subtitle = "Airbnb listings: Edinburgh, Scotland",
    x = "Review Score Ratings",
    y = "Edinburgh\nNeighborhoods", 
    caption = "Source: Opensource dataset dsbox::edibnb"
) +

theme_ridges() +
theme(legend.position = "none")

Intepretation

The ridgeline plot above visualizes the distribution of Airbnb review scores across different Edinburgh neighborhoods, ordered by their respective median review scores. The neighborhoods with the highest overall reviews appear at the top of the plot with a descending order down the plot to the neighborhoods with the lowest review scores. Most review scores cluster tightly between 90 and 100 (x-axis), suggesting generally positive experiences overall in the set of reviews. However, some neighborhoods display broader distributions or lower medians. A broader distribution indicates that the reviews have a larger spread along the review spectrum.

2 - Foreign Connected PACs

# get a list of files with "Foreign Connected PAC" in their names
list_of_files <- dir_ls(path = "data", regexp = "Foreign Connected PAC")

# read all files and row bind them
# keeping track of the file name in a new column called year
pac <- read_csv(list_of_files, id = "year")
glimpse(pac)
Rows: 2,394
Columns: 6
$ year                               <chr> "data/Foreign Connected PACs, 1999-…
$ `PAC Name (Affiliate)`             <chr> "7-Eleven", "ABB Group", "Accenture…
$ `Country of Origin/Parent Company` <chr> "Japan/Ito-Yokado", "Switzerland/As…
$ Total                              <chr> "$8500", "$46000", "$75984", "$3850…
$ Dems                               <chr> "$1500", "$17000", "$23000", "$1250…
$ Repubs                             <chr> "$7000", "$28500", "$52984", "$2600…
pac <- pac |>
    clean_names() |>
    separate(
        country_of_origin_parent_company,
        into = c("country", "parent_company"),
        sep = "/", 
        remove = TRUE) |>

    mutate(
        year = str_extract(year, "\\d{4}-\\d{4}"),
        year = str_extract(year, "\\d{4}$"),
        year = as.integer(year)
    ) |>

    select(-total)

pac
# A tibble: 2,394 × 6
    year pac_name_affiliate                  country parent_company dems  repubs
   <int> <chr>                               <chr>   <chr>          <chr> <chr> 
 1  2000 7-Eleven                            Japan   Ito-Yokado     $1500 $7000 
 2  2000 ABB Group                           Switze… Asea Brown Bo… $170… $28500
 3  2000 Accenture                           UK      Accenture plc  $230… $52984
 4  2000 ACE INA                             UK      ACE Group      $125… $26000
 5  2000 Acuson Corp (Siemens AG)            Germany Siemens AG     $2000 $0    
 6  2000 Adtranz (DaimlerChrysler)           Germany DaimlerChrysl… $100… $500  
 7  2000 AE Staley Manufacturing (Tate & Ly… UK      Tate & Lyle    $100… $14000
 8  2000 AEGON USA (AEGON NV)                Nether… Aegon NV       $105… $47750
 9  2000 AIM Management Group                UK      AMVESCAP       $100… $15000
10  2000 Air Liquide America                 France  L'Air Liquide… $0    $0    
# ℹ 2,384 more rows
pac <- pac |>
    pivot_longer(
        cols = c(dems, repubs),
        names_to = "party",
        values_to = "amount"
    ) |>

    mutate(
        amount = str_remove(amount, "\\$"),
        amount = as.integer(amount)
    )

pac
# A tibble: 4,788 × 6
    year pac_name_affiliate       country     parent_company    party  amount
   <int> <chr>                    <chr>       <chr>             <chr>   <int>
 1  2000 7-Eleven                 Japan       Ito-Yokado        dems     1500
 2  2000 7-Eleven                 Japan       Ito-Yokado        repubs   7000
 3  2000 ABB Group                Switzerland Asea Brown Boveri dems    17000
 4  2000 ABB Group                Switzerland Asea Brown Boveri repubs  28500
 5  2000 Accenture                UK          Accenture plc     dems    23000
 6  2000 Accenture                UK          Accenture plc     repubs  52984
 7  2000 ACE INA                  UK          ACE Group         dems    12500
 8  2000 ACE INA                  UK          ACE Group         repubs  26000
 9  2000 Acuson Corp (Siemens AG) Germany     Siemens AG        dems     2000
10  2000 Acuson Corp (Siemens AG) Germany     Siemens AG        repubs      0
# ℹ 4,778 more rows
uk_spending <- pac |>
    filter(country == "UK") |>
    group_by(year, party) |>

    summarise(
        totals = sum(amount[amount != 0],
                     na.rm = TRUE), 
        .groups = "drop") |>

    arrange(year, party)

uk_spending
# A tibble: 24 × 3
    year party   totals
   <int> <chr>    <int>
 1  2000 dems    975725
 2  2000 repubs 2057518
 3  2002 dems   1046183
 4  2002 repubs 2002772
 5  2004 dems   1188801
 6  2004 repubs 2311101
 7  2006 dems   1543755
 8  2006 repubs 3057736
 9  2008 dems   2690413
10  2008 repubs 2842956
# ℹ 14 more rows
ggplot(data = uk_spending, 
       aes(x = year, y = totals / 1e6, color = party)) +

geom_line() +
scale_color_manual(values = c("repubs" = "red", 
                              "dems" = "blue"),
                  labels = c("Democrats", "Republicans")) +
scale_x_continuous(breaks = seq(1996, 2022, by = 4),
                   labels = seq(1996, 2022, by = 4)) +
scale_y_continuous(labels = function(x) paste0("$", round(x), "M")) +

labs(
    title = "Contributions to US political parties from UK-connected PACs",
    color = "Party",
    x = "Year",
    y = "Total amount",
    caption = "Source: OpenSecrets.org"
) +
theme_minimal() +
theme(
    axis.title.y = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0),
                                hjust = 0),
    axis.title.x = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0), 
                                hjust = 0),
    legend.position = c(0.87, 0.15)
)

swiss_spending <- pac |>
    filter(country == "Switzerland") |>
    group_by(year, party) |>

    summarise(
        totals = sum(amount[amount != 0],
                     na.rm = TRUE), 
        .groups = "drop") |>

    arrange(year, party)


ggplot(data = swiss_spending, 
       aes(x = year, y = totals / 1e6, color = party)) +

geom_line() +
scale_color_manual(values = c("repubs" = "red", 
                              "dems" = "blue"),
                  labels = c("Democrats", "Republicans")) +
scale_x_continuous(breaks = seq(1996, 2022, by = 4),
                   labels = seq(1996, 2022, by = 4)) +
scale_y_continuous(labels = function(x) paste0("$", round(x), "M")) +

labs(
    title = "Contributions to US political parties from Swiss-connected PACs",
    color = "Party",
    x = "Year",
    y = "Total amount",
    caption = "Source: OpenSecrets.org"
) +
theme_minimal() +
theme(
    axis.title.y = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0), 
                                hjust = 0),
    axis.title.x = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0), 
                                hjust = 0),
    legend.position = c(0.87, 0.15)
)

Intepretation

Contributions from Swiss-connected PACs to U.S. political parties have grown since the year 2000. The peak around key election years. The data shows a clear preference for Republican candidates, especially from 2008 onward. This might reflect Swiss alignment of ideologies or policies with Republican platforms. In contrast, Democratic contributions also grew, though they remained more modest and stable over the yearly span of this dataset.

3 - Median housing prices in the US

median_housing <- read_csv("data/median-housing.csv")

median_housing <- median_housing |>
  rename(date = DATE) |>
  rename(price = MSPUS)
glimpse(median_housing)
Rows: 234
Columns: 2
$ date  <date> 1963-01-01, 1963-04-01, 1963-07-01, 1963-10-01, 1964-01-01, 196…
$ price <dbl> 17800, 18000, 17900, 18500, 18500, 18900, 18900, 19400, 20200, 1…
recessions <- read_csv("data/recessions.csv")
glimpse(recessions)
Rows: 34
Columns: 2
$ Peak   <date> 1857-06-01, 1860-10-01, 1865-04-01, 1869-06-01, 1873-10-01, 18…
$ Trough <date> 1858-12-01, 1861-06-01, 1867-12-01, 1870-12-01, 1879-03-01, 18…
ggplot(data = median_housing, 
       aes(x = date, y = price)) +

geom_line(color = "darkblue") +
scale_x_date(breaks = seq(as.Date("1960-01-01"), 
                          as.Date("2020-01-01"), 
                          by = "5 years"),
             labels = date_format("%Y")) +

scale_y_continuous(breaks = seq(0, 400000, by = 40000),
                   labels = label_number(accuracy = 1, 
                                         big.mark = ",")) +

labs(
    title = "Median sales prices of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
    caption = "Source: Census; HUD"
) +
theme_minimal() +
theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank()
)

recessions <- recessions |>
    mutate(
        is_recess = if_else(Peak >= as.Date("1963-01-01", 
                                            format = "%Y-%m-%d") & 
                            Trough <= as.Date("2021-04-01", 
                                              format = "%Y-%m-%d"),
                            TRUE, FALSE)
    ) |>
    filter(is_recess == TRUE)
glimpse(recessions)
Rows: 8
Columns: 3
$ Peak      <date> 1969-12-01, 1973-11-01, 1980-01-01, 1981-07-01, 1990-07-01,…
$ Trough    <date> 1970-11-01, 1975-03-01, 1980-07-01, 1982-11-01, 1991-03-01,…
$ is_recess <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
ggplot(data = median_housing, 
       aes(x = date, 
           y = price)) +

geom_rect(
    data = recessions,
    aes(
        xmin = as.Date(Peak), 
        xmax = as.Date(Trough),
        ymin = -Inf, 
        ymax = Inf,
        y = NULL,
        x = NULL
    ), 
    fill = "cornsilk3"
    ) +

geom_line(color = "darkblue") +
scale_x_date(breaks = seq(as.Date("1960-01-01"), 
                          as.Date("2020-01-01"), 
                          by = "5 years"),
             labels = date_format("%Y")) +

scale_y_continuous(breaks = seq(0, 400000, by = 40000),
                   labels = label_number(accuracy = 1, big.mark = ",")) +

labs(
    title = "Median sales prices of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
    caption = "Shaded areas indicate U.S. recessions\nSource: Census; HUD"
) +
theme_minimal() +
theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank()
)

quarters <- median_housing |>
    mutate(
        year = year(date),
        quarter = paste0("Q", quarter(date))
        ) |>
    arrange(date) |>
    filter(year %in% c(2019, 2020)) |> glimpse()
Rows: 8
Columns: 4
$ date    <date> 2019-01-01, 2019-04-01, 2019-07-01, 2019-10-01, 2020-01-01, 2…
$ price   <dbl> 313000, 322500, 318400, 327100, 329000, 322600, 337500, 358700
$ year    <dbl> 2019, 2019, 2019, 2019, 2020, 2020, 2020, 2020
$ quarter <chr> "Q1", "Q2", "Q3", "Q4", "Q1", "Q2", "Q3", "Q4"
ggplot(data = quarters, 
       aes(x = date, 
           y = price,
          group = 1)) +

geom_line(color = "darkblue") +
scale_y_continuous(limits = c(300000, 360000),
                   breaks = seq(300000, 360000, by = 20000),
                   labels = label_number(accuracy = 1, big.mark = ","),
                   expand = c(0.008, 0.008)) +

scale_x_date(breaks = quarters$date,
             labels = quarters$quarter,
             expand = c(0.008, 0.008)) +

geom_point(color = "darkblue", 
           size = 2, 
           shape = 21, 
           fill = "white") +

annotate("text", x = as.Date("2019-05-15"), y = 301000, label = "2019", size = 4) +
annotate("text", x = as.Date("2020-05-15"), y = 301000, label = "2020", size = 4) +

labs(
    title = "Median sales prices of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
) +

theme_minimal() +
theme(
    panel.grid.minor.x = element_blank(),
)

4 - Expect More. Plot More.

5 - Mirror, mirror on the wall, who’s the ugliest of them all?